{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Download ACS 5-Year Estimates\n",
    "\n",
    "This notebook downloads American Community Survey (ACS) 5-Year Estimate data\n",
    "from the U.S. Census Bureau API for table groups listed in `../Input/ACSGroups_Initial.csv`.\n",
    "\n",
    "Data is downloaded at three geographic levels:\n",
    "- **ZIP Code (ZCTA)**: All ZCTAs in Alabama (`ACS_ALZipCodes.csv`)\n",
    "- **County**: All counties in Alabama (`ACS_ALCounties.csv`)\n",
    "- **State**: All U.S. states, DC, and Puerto Rico (`ACS_USStates.csv`)\n",
    "\n",
    "Each output CSV has one row per location and one column per ACS variable, plus\n",
    "`LandArea` and `PopulationDensity` columns. States and counties are identified\n",
    "by name; ZIP codes use their numeric codes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl\n",
    "import requests\n",
    "import time\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"../Input/ACSAPIKey.txt\", \"r\") as f:\n",
    "    lines = f.readlines()\n",
    "    CENSUS_API_KEY = lines[1].strip()\n",
    "\n",
    "YEAR = 2024\n",
    "DATASET = \"acs/acs5\"\n",
    "BASE_URL = f\"https://api.census.gov/data/{YEAR}/{DATASET}\"\n",
    "GEOINFO_URL = \"https://api.census.gov/data/2020/geoinfo\"\n",
    "\n",
    "GEO_LEVELS = [\n",
    "    {\n",
    "        \"name\": \"ALZipCodes\",\n",
    "        \"label\": \"Alabama ZIP Codes (ZCTAs)\",\n",
    "        \"for\": \"zip code tabulation area:*\",\n",
    "        \"in\": None,\n",
    "        \"geo_col\": \"zip code tabulation area\",\n",
    "        \"filter_prefix\": [\"35\", \"36\"],\n",
    "    },\n",
    "    {\n",
    "        \"name\": \"ALCounties\",\n",
    "        \"label\": \"Alabama Counties\",\n",
    "        \"for\": \"county:*\",\n",
    "        \"in\": \"state:01\",\n",
    "        \"geo_col\": \"county\",\n",
    "        \"filter_prefix\": None,\n",
    "    },\n",
    "    {\n",
    "        \"name\": \"USStates\",\n",
    "        \"label\": \"U.S. States\",\n",
    "        \"for\": \"state:*\",\n",
    "        \"in\": None,\n",
    "        \"geo_col\": \"state\",\n",
    "        \"filter_prefix\": None,\n",
    "    },\n",
    "]\n",
    "\n",
    "OUTPUT_FILES = {\n",
    "    \"ALZipCodes\": \"ACS_ALZipCodes.csv\",\n",
    "    \"ALCounties\": \"ACS_ALCounties.csv\",\n",
    "    \"USStates\": \"ACS_USStates.csv\",\n",
    "}\n",
    "\n",
    "BATCH_SIZE = 50\n",
    "SLEEP_SECONDS = 0.5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Read the ACS Group IDs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 53 ACS table groups.\n",
      "['B01002', 'B01003', 'B02001', 'B05001', 'B05002', 'B05011', 'B05012', 'B06009', 'B06010', 'B06012', 'B08006', 'B08012', 'B08014', 'B08016', 'B08017', 'B08018', 'B08122', 'B08124', 'B08201', 'B08202', 'B08203', 'B08301', 'B08303', 'B15002', 'B15003', 'B15011', 'B17020', 'B19013', 'B19019', 'B19049', 'B19081', 'B19101', 'B19113', 'B19119', 'B19121', 'B19125', 'B19126', 'B19301', 'B23020', 'B23027', 'B25002', 'B25018', 'B25040', 'B25071', 'B25077', 'B25104', 'B25132', 'B25133', 'B28010', 'B28011', 'C15010', 'C17002', 'C24050']\n"
     ]
    }
   ],
   "source": [
    "groups_df = pl.read_csv(\"../Input/ACSGroups_Initial.csv\")\n",
    "groups = groups_df[\"Group\"].to_list()\n",
    "print(f\"Loaded {len(groups)} ACS table groups.\")\n",
    "print(groups)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Define Helper Functions\n",
    "\n",
    "- `get_estimate_variables(group)`: Queries the Census API group endpoint to discover\n",
    "  all estimate variables (ending in `E`, excluding annotation columns ending in `EA` or `M`).\n",
    "- `rename_variable(var)`: Converts Census API variable names (e.g., `B05002_015E`) to\n",
    "  the project convention (e.g., `B05002_E015`).\n",
    "- `download_group(group, geo_level)`: Downloads all estimate variables for one group\n",
    "  at a given geographic level, handling batching and rate limiting.\n",
    "- `download_land_area(geo_level)`: Downloads land area (sq mi) from the Census GEOINFO API.\n",
    "- `download_name_mapping(geo_level)`: Downloads FIPS code to human-readable name mapping.\n",
    "- `build_dictionary(groups)`: Builds a variable dictionary from Census API group metadata."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_estimate_variables(group: str) -> list[str]:\n",
    "    \"\"\"Get sorted list of estimate variable names for a Census table group.\"\"\"\n",
    "    url = f\"{BASE_URL}/groups/{group}.json\"\n",
    "    resp = requests.get(url)\n",
    "    resp.raise_for_status()\n",
    "    variables = resp.json()[\"variables\"]\n",
    "    estimate_vars = sorted(\n",
    "        name\n",
    "        for name in variables\n",
    "        if name.endswith(\"E\")\n",
    "        and not name.endswith(\"EA\")\n",
    "        and not name.endswith(\"M\")\n",
    "        and name != \"NAME\"\n",
    "    )\n",
    "    return estimate_vars\n",
    "\n",
    "\n",
    "def rename_variable(var: str) -> str:\n",
    "    \"\"\"Rename Census API variable to project convention.\n",
    "\n",
    "    B05002_015E -> B05002_E015\n",
    "    \"\"\"\n",
    "    parts = var.rsplit(\"_\", 1)\n",
    "    prefix = parts[0]\n",
    "    suffix = parts[1]  # e.g. '015E'\n",
    "    number = suffix[:-1]  # e.g. '015'\n",
    "    return f\"{prefix}_E{number}\"\n",
    "\n",
    "\n",
    "def download_group(group: str, geo_level: dict) -> pl.DataFrame | None:\n",
    "    \"\"\"Download all estimate variables for one group at one geographic level.\"\"\"\n",
    "    estimate_vars = get_estimate_variables(group)\n",
    "    if not estimate_vars:\n",
    "        print(f\"  No estimate variables found for {group}.\")\n",
    "        return None\n",
    "\n",
    "    geo_col = geo_level[\"geo_col\"]\n",
    "    batches = [\n",
    "        estimate_vars[i : i + BATCH_SIZE]\n",
    "        for i in range(0, len(estimate_vars), BATCH_SIZE)\n",
    "    ]\n",
    "    num_batches = len(batches)\n",
    "    print(\n",
    "        f\"  {group}: {len(estimate_vars)} estimate variables ({num_batches} batch{'es' if num_batches != 1 else ''})\"\n",
    "    )\n",
    "\n",
    "    combined_df = None\n",
    "\n",
    "    for batch_idx, batch_vars in enumerate(batches):\n",
    "        get_vars = \",\".join(batch_vars)\n",
    "        params = {\n",
    "            \"get\": get_vars,\n",
    "            \"for\": geo_level[\"for\"],\n",
    "            \"key\": CENSUS_API_KEY,\n",
    "        }\n",
    "        if geo_level[\"in\"] is not None:\n",
    "            params[\"in\"] = geo_level[\"in\"]\n",
    "\n",
    "        resp = requests.get(BASE_URL, params=params)\n",
    "        resp.raise_for_status()\n",
    "        data = resp.json()\n",
    "\n",
    "        header = data[0]\n",
    "        rows = data[1:]\n",
    "\n",
    "        # Deduplicate header columns (API may return 'state' twice for county queries)\n",
    "        seen = {}\n",
    "        unique_header = []\n",
    "        drop_indices = set()\n",
    "        for i, col in enumerate(header):\n",
    "            if col in seen:\n",
    "                drop_indices.add(i)\n",
    "            else:\n",
    "                seen[col] = i\n",
    "                unique_header.append(col)\n",
    "        if drop_indices:\n",
    "            rows = [\n",
    "                [val for i, val in enumerate(row) if i not in drop_indices]\n",
    "                for row in rows\n",
    "            ]\n",
    "\n",
    "        # Force all columns to Utf8 (Census API returns strings; avoids\n",
    "        # schema inference mismatches on large responses like 33k ZCTAs)\n",
    "        schema = {col: pl.Utf8 for col in unique_header}\n",
    "        batch_df = pl.DataFrame(rows, schema=schema, orient=\"row\")\n",
    "\n",
    "        # Keep only geo_col and estimate variables\n",
    "        keep_cols = [geo_col] + [c for c in batch_vars if c in batch_df.columns]\n",
    "        batch_df = batch_df.select(keep_cols)\n",
    "\n",
    "        if combined_df is None:\n",
    "            combined_df = batch_df\n",
    "        else:\n",
    "            combined_df = combined_df.join(batch_df, on=geo_col, how=\"left\")\n",
    "\n",
    "        time.sleep(SLEEP_SECONDS)\n",
    "\n",
    "    # Filter to specific geography prefixes (e.g. Alabama ZCTAs)\n",
    "    filter_prefix = geo_level.get(\"filter_prefix\")\n",
    "    if filter_prefix and combined_df is not None:\n",
    "        mask = pl.lit(False)\n",
    "        for prefix in filter_prefix:\n",
    "            mask = mask | pl.col(geo_col).str.starts_with(prefix)\n",
    "        combined_df = combined_df.filter(mask)\n",
    "\n",
    "    # Rename variables to project convention\n",
    "    rename_map = {\n",
    "        var: rename_variable(var)\n",
    "        for var in estimate_vars\n",
    "        if var in combined_df.columns\n",
    "    }\n",
    "    combined_df = combined_df.rename(rename_map)\n",
    "\n",
    "    return combined_df\n",
    "\n",
    "\n",
    "def download_land_area(geo_level: dict) -> pl.DataFrame:\n",
    "    \"\"\"Download land area (square miles) from the Census GEOINFO API.\"\"\"\n",
    "    geo_col = geo_level[\"geo_col\"]\n",
    "    params = {\n",
    "        \"get\": \"AREALAND_SQMI\",\n",
    "        \"for\": geo_level[\"for\"],\n",
    "        \"key\": CENSUS_API_KEY,\n",
    "    }\n",
    "    if geo_level[\"in\"] is not None:\n",
    "        params[\"in\"] = geo_level[\"in\"]\n",
    "\n",
    "    resp = requests.get(GEOINFO_URL, params=params)\n",
    "    resp.raise_for_status()\n",
    "    data = resp.json()\n",
    "\n",
    "    header = data[0]\n",
    "    rows = data[1:]\n",
    "\n",
    "    # Deduplicate header columns\n",
    "    seen = {}\n",
    "    unique_header = []\n",
    "    drop_indices = set()\n",
    "    for i, col in enumerate(header):\n",
    "        if col in seen:\n",
    "            drop_indices.add(i)\n",
    "        else:\n",
    "            seen[col] = i\n",
    "            unique_header.append(col)\n",
    "    if drop_indices:\n",
    "        rows = [\n",
    "            [val for i, val in enumerate(row) if i not in drop_indices]\n",
    "            for row in rows\n",
    "        ]\n",
    "\n",
    "    schema = {col: pl.Utf8 for col in unique_header}\n",
    "    df = pl.DataFrame(rows, schema=schema, orient=\"row\")\n",
    "    df = df.select([geo_col, \"AREALAND_SQMI\"])\n",
    "\n",
    "    # Filter by prefix if needed\n",
    "    filter_prefix = geo_level.get(\"filter_prefix\")\n",
    "    if filter_prefix:\n",
    "        mask = pl.lit(False)\n",
    "        for prefix in filter_prefix:\n",
    "            mask = mask | pl.col(geo_col).str.starts_with(prefix)\n",
    "        df = df.filter(mask)\n",
    "\n",
    "    # Keep only the latest row per geography (GEOINFO may return duplicates)\n",
    "    df = df.unique(subset=[geo_col], keep=\"last\")\n",
    "\n",
    "    df = df.rename({\"AREALAND_SQMI\": \"LandArea\"})\n",
    "    df = df.with_columns(pl.col(\"LandArea\").cast(pl.Float64))\n",
    "    return df\n",
    "\n",
    "\n",
    "def download_name_mapping(geo_level: dict) -> dict[str, str]:\n",
    "    \"\"\"Download FIPS code to human-readable name mapping from Census API.\"\"\"\n",
    "    geo_col = geo_level[\"geo_col\"]\n",
    "    params = {\n",
    "        \"get\": \"NAME\",\n",
    "        \"for\": geo_level[\"for\"],\n",
    "        \"key\": CENSUS_API_KEY,\n",
    "    }\n",
    "    if geo_level[\"in\"] is not None:\n",
    "        params[\"in\"] = geo_level[\"in\"]\n",
    "\n",
    "    resp = requests.get(BASE_URL, params=params)\n",
    "    resp.raise_for_status()\n",
    "    data = resp.json()\n",
    "\n",
    "    header = data[0]\n",
    "    rows = data[1:]\n",
    "\n",
    "    # Deduplicate header columns\n",
    "    seen = {}\n",
    "    unique_header = []\n",
    "    drop_indices = set()\n",
    "    for i, col in enumerate(header):\n",
    "        if col in seen:\n",
    "            drop_indices.add(i)\n",
    "        else:\n",
    "            seen[col] = i\n",
    "            unique_header.append(col)\n",
    "    if drop_indices:\n",
    "        rows = [\n",
    "            [val for i, val in enumerate(row) if i not in drop_indices]\n",
    "            for row in rows\n",
    "        ]\n",
    "\n",
    "    name_idx = unique_header.index(\"NAME\")\n",
    "    geo_idx = unique_header.index(geo_col)\n",
    "\n",
    "    return {row[geo_idx]: row[name_idx] for row in rows}\n",
    "\n",
    "\n",
    "def build_dictionary(groups: list[str]) -> pl.DataFrame:\n",
    "    \"\"\"Build a variable dictionary from Census API group metadata.\"\"\"\n",
    "    records = []\n",
    "    for idx, group in enumerate(groups, start=1):\n",
    "        print(f\"[{idx}/{len(groups)}] {group}\", end=\" \")\n",
    "        url = f\"{BASE_URL}/groups/{group}.json\"\n",
    "        resp = requests.get(url)\n",
    "        resp.raise_for_status()\n",
    "        group_data = resp.json()\n",
    "\n",
    "        variables = group_data[\"variables\"]\n",
    "        for var_name, var_info in sorted(variables.items()):\n",
    "            if (\n",
    "                var_name.endswith(\"E\")\n",
    "                and not var_name.endswith(\"EA\")\n",
    "                and not var_name.endswith(\"M\")\n",
    "                and var_name != \"NAME\"\n",
    "            ):\n",
    "                records.append({\n",
    "                    \"Variable\": rename_variable(var_name),\n",
    "                    \"Label\": var_info.get(\"label\", \"\"),\n",
    "                    \"Concept\": var_info.get(\"concept\", \"\"),\n",
    "                    \"Group\": group,\n",
    "                })\n",
    "        time.sleep(SLEEP_SECONDS)\n",
    "        print(f\"ok\")\n",
    "\n",
    "    # Add LandArea and PopulationDensity entries\n",
    "    records.append({\n",
    "        \"Variable\": \"LandArea\",\n",
    "        \"Label\": \"Land area (square miles)\",\n",
    "        \"Concept\": \"GEOGRAPHIC CHARACTERISTICS\",\n",
    "        \"Group\": \"GEOINFO\",\n",
    "    })\n",
    "    records.append({\n",
    "        \"Variable\": \"PopulationDensity\",\n",
    "        \"Label\": \"Land area per capita (square miles per person); LandArea / B01003_E001\",\n",
    "        \"Concept\": \"DERIVED VARIABLE\",\n",
    "        \"Group\": \"DERIVED\",\n",
    "    })\n",
    "\n",
    "    return pl.DataFrame(records)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Download All Groups, Add Land Area, and Resolve Location Names\n",
    "\n",
    "For each geographic level: download all ACS table groups, merge them, cast to\n",
    "numeric, add land area and population density, and replace FIPS codes with\n",
    "human-readable names (states and counties). If the output CSV already exists,\n",
    "it is loaded directly and the download is skipped."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "ACS_ALZipCodes.csv already exists, loading existing data.\n",
      "\n",
      "ACS_ALCounties.csv already exists, loading existing data.\n",
      "\n",
      "ACS_USStates.csv already exists, loading existing data.\n"
     ]
    }
   ],
   "source": [
    "geo_results = {}\n",
    "\n",
    "for geo_level in GEO_LEVELS:\n",
    "    level_name = geo_level[\"name\"]\n",
    "    geo_col = geo_level[\"geo_col\"]\n",
    "    output_file = OUTPUT_FILES[level_name]\n",
    "\n",
    "    # If output file exists, load it and skip the download\n",
    "    if Path(output_file).exists():\n",
    "        print(f\"\\n{output_file} already exists, loading existing data.\")\n",
    "        geo_results[level_name] = pl.read_csv(output_file)\n",
    "        continue\n",
    "\n",
    "    print(f\"\\n{'='*60}\")\n",
    "    print(f\"Downloading: {geo_level['label']}\")\n",
    "    print(f\"{'='*60}\")\n",
    "\n",
    "    group_dfs = []\n",
    "\n",
    "    for idx, group in enumerate(groups, start=1):\n",
    "        print(f\"[{idx}/{len(groups)}]\", end=\" \")\n",
    "        try:\n",
    "            df = download_group(group, geo_level)\n",
    "            if df is not None:\n",
    "                group_dfs.append(df)\n",
    "        except Exception as e:\n",
    "            print(f\"  ERROR downloading {group}: {e}\")\n",
    "\n",
    "    if not group_dfs:\n",
    "        print(f\"\\nNo data downloaded for {level_name}.\")\n",
    "        continue\n",
    "\n",
    "    # Outer-join all group DataFrames on geo column\n",
    "    merged = group_dfs[0]\n",
    "    for df in group_dfs[1:]:\n",
    "        merged = merged.join(df, on=geo_col, how=\"full\", coalesce=True)\n",
    "    merged = merged.sort(geo_col)\n",
    "    print(f\"\\nMerged {level_name}: {merged.shape[0]} locations x {merged.shape[1] - 1} variables\")\n",
    "\n",
    "    # Cast variable columns to Float64 (Census API returns strings)\n",
    "    var_cols = [c for c in merged.columns if c != geo_col]\n",
    "    merged = merged.with_columns(\n",
    "        [pl.col(c).cast(pl.Float64, strict=False) for c in var_cols]\n",
    "    )\n",
    "\n",
    "    # Add land area and population density\n",
    "    print(f\"Downloading land area for {geo_level['label']}...\")\n",
    "    try:\n",
    "        land_area_df = download_land_area(geo_level)\n",
    "        merged = merged.join(land_area_df, on=geo_col, how=\"left\")\n",
    "        merged = merged.with_columns(\n",
    "            (pl.col(\"LandArea\") / pl.col(\"B01003_E001\")).alias(\"PopulationDensity\")\n",
    "        )\n",
    "        print(f\"  Added LandArea and PopulationDensity.\")\n",
    "    except Exception as e:\n",
    "        print(f\"  ERROR downloading land area: {e}\")\n",
    "\n",
    "    # Replace geo codes with human-readable location names\n",
    "    if level_name in (\"ALCounties\", \"USStates\"):\n",
    "        print(f\"Downloading location names for {geo_level['label']}...\")\n",
    "        name_map = download_name_mapping(geo_level)\n",
    "        if level_name == \"ALCounties\":\n",
    "            name_map = {k: v.replace(\", Alabama\", \"\") for k, v in name_map.items()}\n",
    "        merged = merged.with_columns(\n",
    "            pl.col(geo_col).replace(name_map).alias(\"Location\")\n",
    "        ).drop(geo_col)\n",
    "    else:\n",
    "        # ZIP codes are already human-readable\n",
    "        merged = merged.rename({geo_col: \"Location\"})\n",
    "\n",
    "    # Move Location to first column\n",
    "    cols = [\"Location\"] + [c for c in merged.columns if c != \"Location\"]\n",
    "    merged = merged.select(cols)\n",
    "\n",
    "    geo_results[level_name] = merged\n",
    "    print(f\"Final {level_name}: {merged.shape[0]} locations x {merged.shape[1] - 1} variables\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Save to CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved ACS_ALZipCodes.csv: 656 rows x 910 columns\n",
      "Saved ACS_ALCounties.csv: 67 rows x 910 columns\n",
      "Saved ACS_USStates.csv: 52 rows x 910 columns\n"
     ]
    }
   ],
   "source": [
    "for level_name, filename in OUTPUT_FILES.items():\n",
    "    if level_name in geo_results:\n",
    "        df = geo_results[level_name]\n",
    "        df.write_csv(filename)\n",
    "        print(f\"Saved {filename}: {df.shape[0]} rows x {df.shape[1]} columns\")\n",
    "    else:\n",
    "        print(f\"Skipped {filename}: no data available.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Download Variable Dictionary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ACS_Dictionary.csv already exists, skipping dictionary download.\n",
      "\n",
      "Dictionary: 909 variables\n",
      "shape: (5, 4)\n",
      "┌─────────────┬─────────────────────────────────┬───────────────────┬────────┐\n",
      "│ Variable    ┆ Label                           ┆ Concept           ┆ Group  │\n",
      "│ ---         ┆ ---                             ┆ ---               ┆ ---    │\n",
      "│ str         ┆ str                             ┆ str               ┆ str    │\n",
      "╞═════════════╪═════════════════════════════════╪═══════════════════╪════════╡\n",
      "│ B01002_E001 ┆ Estimate!!Median age --!!Total… ┆ Median Age by Sex ┆ B01002 │\n",
      "│ B01002_E002 ┆ Estimate!!Median age --!!Male   ┆ Median Age by Sex ┆ B01002 │\n",
      "│ B01002_E003 ┆ Estimate!!Median age --!!Femal… ┆ Median Age by Sex ┆ B01002 │\n",
      "│ B01003_E001 ┆ Estimate!!Total                 ┆ Total Population  ┆ B01003 │\n",
      "│ B02001_E001 ┆ Estimate!!Total:                ┆ Race              ┆ B02001 │\n",
      "└─────────────┴─────────────────────────────────┴───────────────────┴────────┘\n"
     ]
    }
   ],
   "source": [
    "dict_file = \"ACS_Dictionary.csv\"\n",
    "\n",
    "if Path(dict_file).exists():\n",
    "    print(f\"{dict_file} already exists, skipping dictionary download.\")\n",
    "    dictionary = pl.read_csv(dict_file)\n",
    "else:\n",
    "    print(\"Building variable dictionary from Census API...\")\n",
    "    dictionary = build_dictionary(groups)\n",
    "    dictionary.write_csv(dict_file)\n",
    "    print(f\"\\nSaved {dict_file}: {dictionary.shape[0]} rows x {dictionary.shape[1]} columns\")\n",
    "\n",
    "print(f\"\\nDictionary: {dictionary.shape[0]} variables\")\n",
    "print(dictionary.head(5))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Download Summary\n",
      "==================================================\n",
      "  ALZipCodes: 656 locations x 909 variables\n",
      "    Sample locations: [35004, 35005, 35006, 35007, 35010]\n",
      "  ALCounties: 67 locations x 909 variables\n",
      "    Sample locations: ['Autauga County', 'Baldwin County', 'Barbour County', 'Bibb County', 'Blount County']\n",
      "  USStates: 52 locations x 909 variables\n",
      "    Sample locations: ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California']\n",
      "  Dictionary: 909 variables\n"
     ]
    }
   ],
   "source": [
    "print(\"Download Summary\")\n",
    "print(\"=\" * 50)\n",
    "for level_name, df in geo_results.items():\n",
    "    num_locs = df.shape[0]\n",
    "    num_vars = df.shape[1] - 1  # exclude Location column\n",
    "    print(f\"  {level_name}: {num_locs} locations x {num_vars} variables\")\n",
    "    sample_locs = df[\"Location\"].head(5).to_list()\n",
    "    print(f\"    Sample locations: {sample_locs}\")\n",
    "print(f\"  Dictionary: {dictionary.shape[0]} variables\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
